Baseline System 1: Hierarchical phrase-based SMT
Here, "fr" represents the source language and "en" represents the target language.
# Locations of the external tools used by the commands that follow. Every
# path is built from ${pathName}, which is not assigned in this section and is
# expected to be set beforehand. "scripts-XXXXXXXX-XXXX" is a placeholder to be
# replaced with the actual directory name of the installed Moses scripts.
SCRIPTS_ROOTDIR="${pathName}/moses-scripts/scripts-XXXXXXXX-XXXX"
export SCRIPTS_ROOTDIR # the only one of these variables that is exported

# Decoder binaries: chart decoder and standard (moses-cmd) decoder.
MOSES_CHART_CMD="${pathName}/moses/trunk/moses-chart-cmd/src/moses_chart"
MOSES_CMD="${pathName}/moses/trunk/moses-cmd/src/moses"

# Directories holding the MERT, GIZA++ and SRILM programs.
MERT_DIR="${pathName}/moses/trunk/mert"
GIZA_BIN_DIR="${pathName}/giza-pp/bin"
SRILM_DIR="${pathName}/srilm/bin/i686-m64"

# Directory of the helper scripts (reuse-weights.perl, detokenizer.perl).
SCRIPTS_DIR="${pathName}/scripts"
Build Language Model
# Estimate the target-side (en) language model with SRILM's ngram-count:
# order 5 with the -interpolate, -kndiscount and -unk options. Input is the
# English training text (tokenized and lowercased, per its file name); the
# model is written to lm/train-all.en.lm.
# -p keeps the step re-runnable when lm/ already exists.
mkdir -p lm
"${SRILM_DIR}/ngram-count" \
  -order 5 -interpolate -kndiscount -unk \
  -text corpus.tok/train-all.tok.lower.en \
  -lm lm/train-all.en.lm
Train Model
# Train the fr->en model with train-model.perl, passing --hierarchical and
# --glue-grammar, grow-diag-final alignment symmetrization, and the 5-gram
# language model from ../lm (given as factor:order:path = 0:5:<absolute path>).
# All output goes under hierarModel/work; stdout and stderr of the run are
# captured in work/training.out.
#
# NOTE: the shell stays inside hierarModel/ afterwards; the relative paths
# (work/..., ../corpus.tok/...) used by the later steps depend on that.
# -p keeps the step re-runnable when the directories already exist.
mkdir -p hierarModel/work
cd hierarModel/
# $(pwd) is quoted so a working directory containing whitespace is still
# passed as a single argument.
"${SCRIPTS_ROOTDIR}/training/train-model.perl" \
  --scripts-root-dir "${SCRIPTS_ROOTDIR}" \
  --root-dir "$(pwd)/work" \
  --bin-dir "${GIZA_BIN_DIR}" \
  --corpus ../corpus.tok/train-all.clean1-40 \
  --f fr \
  --e en \
  --parallel \
  --alignment grow-diag-final \
  --hierarchical \
  --glue-grammar \
  --extract-options="--MinHoleSource 1" \
  --lm "0:5:$(pwd)/../lm/train-all.en.lm" \
  > work/training.out 2>&1
Tuning
# Step 1: filter the trained model down to the development input
# (dev.tok.500.fr); the filtered model is written to work/tuning.dev/filtered.
# -Hierarchical is passed because the model was trained with --hierarchical.
mkdir -p work/tuning.dev
"${SCRIPTS_ROOTDIR}/training/filter-model-given-input.pl" \
  work/tuning.dev/filtered \
  work/model/moses.ini \
  ../corpus.tok/dev.tok.500.fr \
  -Hierarchical

# Step 2: run mert-moses.pl with the chart decoder on the development set
# (fr input, en reference) using the filtered configuration.
# --no-filter-phrase-table is given because the model was already filtered in
# step 1. Results go to work/tuning/mert; stdout and stderr of the run are
# captured in work/tuning/mert.out.
mkdir -p work/tuning
"${SCRIPTS_ROOTDIR}/training/mert-moses.pl" \
  ../corpus.tok/dev.tok.500.fr \
  ../corpus.tok/dev.tok.500.en \
  "${MOSES_CHART_CMD}" \
  "$(pwd)/work/tuning.dev/filtered/moses.ini" \
  --no-filter-phrase-table \
  --rootdir "${SCRIPTS_ROOTDIR}" \
  --mertdir "${MERT_DIR}" \
  --working-dir work/tuning/mert \
  > work/tuning/mert.out 2>&1
Insert weights into configuration file.
# reuse-weights.perl is given the moses.ini produced by tuning as its argument,
# reads the untuned work/model/moses.ini on stdin, and its output is saved as
# work/tuning/moses-tuned.ini.
"${SCRIPTS_DIR}/reuse-weights.perl" \
  work/tuning/mert/moses.ini < work/model/moses.ini > work/tuning/moses-tuned.ini
Run Tuned Decoder
# Step 1: filter the tuned model down to the test input (test.tok.fr); the
# filtered model is written to work/evaluation/filtered.
mkdir -p work/evaluation work/output
"${SCRIPTS_ROOTDIR}/training/filter-model-given-input.pl" \
  work/evaluation/filtered \
  work/tuning/moses-tuned.ini \
  ../corpus.tok/test.tok.fr \
  -Hierarchical

# Step 2: translate the test set with the chart decoder and the filtered,
# tuned configuration. The input is attached with a redirection instead of
# `cat |`, so a missing test file is reported by the shell and the decoder is
# not started on empty input.
"${MOSES_CHART_CMD}" \
  -f work/evaluation/filtered/moses.ini \
  < ../corpus.tok/test.tok.fr \
  > work/output/test.output.txt
Train Recaser (for English)
# Train the recasing model into work/recaser from the English training text
# train-all.tok.en (tokenized, original case, per its file name).
# train-recaser.perl is pointed at train-model.perl and at SRILM's ngram-count,
# which it uses to build the model.
mkdir -p work/recaser
# $(pwd) is quoted so a working directory containing whitespace is still
# passed as a single argument.
"${SCRIPTS_ROOTDIR}/recaser/train-recaser.perl" \
  -train-script "${SCRIPTS_ROOTDIR}/training/train-model.perl" \
  -ngram-count "${SRILM_DIR}/ngram-count" \
  -corpus ../corpus.tok/train-all.tok.en \
  -dir "$(pwd)/work/recaser" \
  -scripts-root-dir "${SCRIPTS_ROOTDIR}"
Recase the output (for English)
# Restore case in the decoder output using the recaser model in work/recaser.
# The decoder handed to recase.perl is ${MOSES_CMD} (the moses-cmd binary),
# not the chart decoder used for translation.
# Result: work/output/test.output.recased.
"${SCRIPTS_ROOTDIR}/recaser/recase.perl" \
  -model work/recaser/moses.ini \
  -in work/output/test.output.txt \
  -moses "${MOSES_CMD}" \
  > work/output/test.output.recased
Detokenize the output
For English
# Detokenize the recased output with detokenizer.perl (-l en).
"${SCRIPTS_DIR}/detokenizer.perl" -l en < work/output/test.output.recased > work/output/test.output.detokenized
For Japanese
# Remove every space character from the raw decoder output
# (test.output.txt; no recasing step is involved for this variant).
cat work/output/test.output.txt | \
  perl -pe 's/ //g;' > work/output/test.output.detokenized